# -*- coding:utf-8 -*-
import requests


def crawl():
    """
    Fetch a WeChat Official Account's history page and print its
    most recent articles.

    The URL and headers below were captured with Fiddler
    (right click -> Copy -> Just Url / Headers Only). The session
    credentials (pass_ticket, x-wechat-key, cookies) expire quickly
    and must be re-captured once the request starts returning the
    verification page.

    Side effects: prints the raw page and the extracted articles,
    and saves the page to ``weixin_history.html``.

    :raises requests.HTTPError: on a non-2xx HTTP response
    :raises Exception: when WeChat serves its anti-crawl verification page
    """
    url = "https://mp.weixin.qq.com/mp/profile_ext" \
          "?action=home" \
          "&__biz=MjM5MzgyODQxMQ==" \
          "&scene=124" \
          "&devicetype=android-24" \
          "&version=26060132" \
          "&lang=zh_CN" \
          "&nettype=WIFI" \
          "&a8scene=3" \
          "&pass_ticket=Bb1Ls%2BE9nT3lNOctnEBdSwEbyqd0OE60stKUEUaAKC0%3D" \
          "&wx_header=1"

    headers = """
Host: mp.weixin.qq.com
Connection: keep-alive
User-Agent: Mozilla/5.0 (Linux; Android 7.0; EVA-AL00 Build/HUAWEIEVA-AL00; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/6.2 TBS/043806 Mobile Safari/537.36 MicroMessenger/6.6.1.1200(0x26060132) NetType/WIFI Language/zh_CN
x-wechat-key: 003b562574905e4f20a1dd3b5b30884b4d5ab4b3cd0a1e4f40a0fae579d9c95061ceb80e3aba56136c79dec215ff50b91969036aa2ec4cd81acd0111000bce80d40f0fdca74f8ade6ed2b959cd8b148a
x-wechat-uin: MjcwMTU4MTU%3D
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,image/wxpic,image/sharpp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,en-GG;q=0.8,en-US;q=0.6
Cookie: pgv_pvi=1374336000; pgv_si=s6228196352; wxtokenkey=0a75edc94d5fc6f8b6f63b5913a641c23eb6f01bc81593e9c7e000a2100c2d68; rewardsn=ba624f494b055d5f8ed9; wxuin=0; devicetype=android-24; version=26060132; lang=zh_CN; pass_ticket=Bb1Ls+E9nT3lNOctnEBdSwEbyqd0OE60stKUEUaAKC0=; wap_sid2=CIf18AwSiAFxOFhHR3FUMnA5bmU5elc2UlNDVFFVaG1WdktIZ0JDVHhEbDh1TFlnc0xxeEo2NGt6RmNYTWJnTERQSWxBUW83dUVCTmd0NEpnTldMNWZVODAtUkdGZ0JYOWN6elB3ai00OWFoMTViLUR2aC02Zjh4MmJWclRmMTJOaGttNHc3VXJRTUFBQX5+MIXpp9MFOA1AlU4=
Q-UA2: QV=3&PL=ADR&PR=WX&PP=com.tencent.mm&PPVN=6.6.1&TBSVC=43602&CO=BK&COVC=043806&PB=GE&VE=GA&DE=PHONE&CHID=0&LCID=9422&MO= EVA-AL00 &RL=1080*1794&OS=7.0&API=24
Q-GUID: 3c75d4e0dfdbf7bc88e6faf213b788cb
Q-Auth: 31045b957cf33acf31e40be2f3e71c5217597676a9729f1b

      """
    headers = headers_to_dict(headers)
    # NOTE(security): verify=False disables TLS certificate validation.
    # It is presumably needed because traffic goes through the Fiddler
    # man-in-the-middle proxy certificate — do not copy this elsewhere.
    response = requests.get(url, headers=headers, verify=False)
    # Fail fast on HTTP-level errors (4xx/5xx) instead of parsing an
    # error body as if it were the history page.
    response.raise_for_status()
    print(response.text)  # debug: dump the raw page
    if '<title>验证</title>' in response.text:
        raise Exception("获取微信公众号文章失败，可能是因为你的请求参数有误，请重新获取")
    # Save the response as an HTML file so it can be re-analyzed offline.
    with open("weixin_history.html", "w", encoding="utf-8") as f:
        f.write(response.text)
    # Extract and print the article data.
    data = extract_data(response.text)
    for item in data:
        print(item)


def headers_to_dict(headers):
    """
    Convert a raw HTTP-header text block such as
    '''
    Host: mp.weixin.qq.com
    Connection: keep-alive
    Cache-Control: max-age=
    '''
    into a dict mapping header names to values.

    Blank lines are skipped; only the first ``:`` on a line splits
    name from value, so values may themselves contain colons.

    :param headers: str, one "Name: value" pair per line
    :return: dict
    """
    pairs = (
        line.strip().split(":", 1)
        for line in headers.split("\n")
        if line.strip()
    )
    return {name: value.strip() for name, value in pairs}


# Extract the history-article data from the page in three steps:
# regex out the embedded JSON, undo the HTML entity escaping, then
# parse it into a list of the most recently published articles.
def extract_data(html_content):
    """
    Extract the history-article list embedded in the page source.

    The page embeds the data as a JavaScript string literal:
        var msgList = '{...}';
    where the JSON payload is HTML-entity escaped (e.g. ``&quot;``).

    :param html_content: str, page source code
    :return: list of article dicts; empty list when no data is found
             (the original returned None here, which crashed callers
             that iterate over the result)
    """
    import html
    import json
    import re

    # Raw string; re.S lets '.' span newlines, non-greedy '.*?' stops
    # at the string literal's closing quote.
    pattern = re.compile(r"msgList = '({.*?})'", flags=re.S)
    match = pattern.search(html_content)
    if not match:
        return []
    # Undo HTML entity escaping before JSON-parsing the payload.
    payload = html.unescape(match.group(1))
    articles = json.loads(payload).get("list", [])
    for item in articles:
        print(item)
    return articles


# Script entry point: run the crawler when executed directly.
if __name__ == '__main__':
    crawl()
